Announcements

Imports

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(rpart)
# install.packages('transformr')
library(gganimate)
library('transformr')

Toy Data Set

# standard base R functionality
# Fix the RNG seed so the knitted document (data, tables, plots) is
# reproducible run-to-run; without it every knit shows different numbers.
set.seed(42)
n <- 30
support_range <- 5
x <- support_range * runif(n = n)  # x ~ Uniform(0, support_range)
x <- x - mean(x)                   # center x at zero
y <- x + runif(n = n)              # linear signal plus Uniform(0, 1) noise

# ALWAYS take the time to get this right for your audience!!
# http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette
cbbPalette <- c("#009E73", "#CC79A7", "#D55E00", "#000000",
                "#56B4E9", "#F0E442", "#0072B2", "#E69F00")

# Scatter of the toy data; the constant 'Data' aesthetic keys the legend
# into the manual colorblind-friendly scale above.
data_space_plot <- ggplot(tibble(x = x, y = y),
                          aes(x = x, y = y, color = 'Data')) +
  geom_point() +
  scale_colour_manual(values = cbbPalette)

data_space_plot + ggtitle('"Data Space"')

# Design matrix: a column of 1s for the intercept plus the predictor x
X <- tibble(intercept=1,x=x)
X
## # A tibble: 30 x 2
##    intercept       x
##        <dbl>   <dbl>
##  1         1  1.64  
##  2         1  0.0227
##  3         1  1.70  
##  4         1 -1.79  
##  5         1 -1.26  
##  6         1 -1.85  
##  7         1  0.512 
##  8         1 -0.0223
##  9         1 -2.35  
## 10         1  0.856 
## # … with 20 more rows
# Parameter vector (intercept, slope), initialized at zero
Beta <- tibble( estimate = c(0,0) )
Beta 
## # A tibble: 2 x 1
##   estimate
##      <dbl>
## 1        0
## 2        0
# Predictions yhat = X %*% Beta — all zero for the zero parameter vector
tibble( yhat = as.matrix(X) %*% as.matrix(Beta) )
## # A tibble: 30 x 1
##    yhat[,"estimate"]
##                <dbl>
##  1                 0
##  2                 0
##  3                 0
##  4                 0
##  5                 0
##  6                 0
##  7                 0
##  8                 0
##  9                 0
## 10                 0
## # … with 20 more rows

Gradient Descent

\[\LARGE \begin{align*} \sum_{i=1}^n (y_i - \hat y_i)^2 = {} & (\mathbf{y}-\mathbf{X} {\boldsymbol \beta})^T(\mathbf{y}-\mathbf{X} {\boldsymbol \beta})\\ = {} & \mathbf{y}^T\mathbf{y}-2{\boldsymbol \beta}^T \mathbf{X}^T\mathbf{y} + {\boldsymbol \beta}^T \mathbf{X}^T \mathbf{X} {\boldsymbol \beta}\\ \nabla_{\mathbf{\boldsymbol \beta}} \frac{1}{2} \left( \mathbf{y}^T\mathbf{y}-2{\boldsymbol \beta}^T \mathbf{X}^T\mathbf{y}+ {\boldsymbol \beta}^T \mathbf{X}^T \mathbf{X} {\boldsymbol \beta}\right) = {} & \frac{1}{2}(-2 \mathbf{X}^T\mathbf{y} + 2 \mathbf{X}^T \mathbf{X} {\boldsymbol \beta}) \\ = {} & \mathbf{X}^T \mathbf{X} {\boldsymbol \beta} - \mathbf{X}^T \mathbf{y}\\ = {} & \mathbf{X}^T \left(\mathbf{X} {\boldsymbol \beta} - \mathbf{y}\right) \end{align*}\]

Gradient Descent in “Parameter” Space

initialize the gradient descent algorithm

# Start gradient descent from Beta = (2, 2), well away from the optimum
Beta <- 0*Beta+2

alpha_learning_rate <- 0.01
step <- 1
# Record the step-1 parameters as column "estimate1".  Note: paste0() has
# no `sep` argument, so the stray sep='' in the original was silently
# concatenated via `...`; paste0(x, step) is the correct call.
Betas_pars <- Beta %>% rename_at(step,
                                 function(x) paste0(x, step))
# Running record of the SSE loss, starting with the initial parameters
loss_pars <- c(sum((as.matrix(X)%*%as.matrix(Beta)-as.matrix(y))^2))

take some gradient descent steps

  • this is repeatedly manually run (10 times)
# One gradient descent step (this chunk is manually re-run, 10 times total).
# https://en.wikipedia.org/wiki/Least_squares#Linear_least_squares
# Gradient of (1/2)||y - X B||^2 is X^T (X B - y); step against it.
Beta <- Beta - alpha_learning_rate*t(as.matrix(X))%*%
  (as.matrix(X)%*%as.matrix(Beta)-as.matrix(y))
# https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/append
# Record the SSE loss after this step
loss_pars <- append(loss_pars,
                    sum((as.matrix(X)%*%as.matrix(Beta)-as.matrix(y))^2))

step <- step + 1
# Store this step's parameters as a new column "estimate<step>"
Betas_pars <- Betas_pars %>%
  add_column(Beta) %>%
  rename_at(step, function(x) paste0(x, step))

# https://tibble.tidyverse.org/reference/add_row.html
# Append the per-step losses as a display-only third row
tmp <- Betas_pars %>% add_row()
tmp[3,]<-loss_pars
tmp
## estimate1 estimate2 estimate3 estimate4 estimate5 estimate6 estimate7
## 1    2.0000  1.557250  1.247324  1.030376  0.878513 0.7722086 0.6977955
## 2    2.0000  1.535938  1.286704  1.152848  1.080957 1.0423472 1.0216107
## 3  114.3721 47.946070 22.069016 11.313687  6.598669 4.4484233 3.4409871
##   estimate8 estimate9 estimate10
## 1 0.6457064  0.609244  0.5837203
## 2 1.0104738  1.004492  1.0012800
## 3 2.9606650  2.729150  2.6168156

create a cost function visualization

# https://stackoverflow.com/questions/42158198/r-equivalent-of-pythons-np-dot-for-3d-array
# https://stackoverflow.com/questions/46843926/broadcasting-in-r/46845541#46845541
# https://stackoverflow.com/questions/37034623/simplest-way-to-repeat-matrix-along-3rd-dimension

# Sum-of-squared-errors surface for a two-parameter linear model.
#
# x1, x2:   predictor vectors of length n (x1 is typically all 1s, the
#           intercept column); y: response vector of length n.
# par1.grid, par2.grid: matrices of identical dimensions holding candidate
#           values for the two parameters.
# Returns a matrix the same shape as the grids, with the SSE at each
# candidate parameter pair.
two_var_cost_func <- function(x1, x2, y, par1.grid, par2.grid){
  n <- length(y)
  # Broadcast: slice i (3rd dim) is the grid scaled by the i-th observation
  par1.x1 <- sweep(replicate(n, par1.grid), MARGIN=3, FUN='*', x1)
  par2.x2 <- sweep(replicate(n, par2.grid), MARGIN=3, FUN='*', x2)
  # Residuals: subtract y[i] from slice i (FUN='-' made explicit)
  resid <- sweep(par1.x1+par2.x2, MARGIN=3, STATS=y, FUN='-')
  SSE <- (resid^2)
  # Collapse the observation dimension.  The argument is `dims` (plural);
  # the original's `dim=2` only worked via partial argument matching.
  rowSums(SSE, dims=2)
}

# Half-width scale of the parameter grid.  (Renamed from `range`, which
# shadowed base::range().)
grid_scale <- 500
# Candidate intercepts and slopes: 401 points each, from -1 to 3 in 0.01 steps
Beta0_hat <- seq(-grid_scale,3*grid_scale,grid_scale/100)/grid_scale
Beta1_hat <- seq(-grid_scale,3*grid_scale,grid_scale/100)/grid_scale
# Expand to matching matrices covering every (β₀, β₁) pair
Beta0_hat.grid <- outer(Beta0_hat, Beta1_hat, function(x1, x2) x1 )
Beta1_hat.grid <- outer(Beta0_hat, Beta1_hat, function(x1, x2) x2 )

# Evaluate the SSE cost surface over the grid (x*0+1 is the intercept column)
SSE <- two_var_cost_func(x1=x*0+1, x2=x, y=y, 
                         par1.grid=Beta0_hat.grid, par2.grid=Beta1_hat.grid)

# https://plotly.com/r/figure-labels/
# https://github.com/plotly/plotly.js/issues/608
# 3D surface of the SSE cost over the (β₀, β₁) grid, saved so descent
# paths can be overlaid on it later.
cost_function <- plot_ly(x=Beta0_hat.grid, y=Beta1_hat.grid, z=SSE,
                         type='surface') %>%
  layout(title = '\n\nCost Function in\n"Model/Parameter Space"',
         scene = list(xaxis=list(title="β₀"),
                      # the slope axis is β₁ (was mislabeled as just "β")
                      yaxis=list(title = "β₁"),
                      zaxis=list(title = "SSE")))

visualize the gradient descent path

# https://stackoverflow.com/questions/44619198/r-plot-ly-3d-graph-with-trace-line
# https://plotly.com/r/line-and-scatter/
# Overlay the recorded gradient descent path — parameters (rows 1-2 of
# Betas_pars) against the loss at each step — on the cost surface.
cost_function %>%
  add_trace(x = unlist(c(Betas_pars[1,])), 
            y = unlist(c(Betas_pars[2,])),
            z = loss_pars, 
            type = "scatter3d",
            mode = "lines+markers",
            line = list(color = "red", width = 4)) -> parameter_space_plot
parameter_space_plot
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Project the model space path into data space

(when run this live we can click through thumbnails of each of these figures)

# For each of the 10 recorded steps, draw the fitted line implied by that
# step's parameters over the data scatter: a single point in "model space"
# projects to a whole line in "data space".
set_names(1:10) %>% map(~ data_space_plot + 
                          geom_abline(aes(color='Model',
                                          intercept = Betas_pars[1,.x], 
                                          slope = Betas_pars[2,.x])) +
                           ggtitle('Viewing a point projection from "Model Space" into "Data Space"'))
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## $`6`

## 
## $`7`

## 
## $`8`

## 
## $`9`

## 
## $`10`

Gradient Descent Re-Imagined

# http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette
cbbPalette <- c("#009E73", "#CC79A7","#D55E00", "#000000",
                "#56B4E9", "#F0E442", "#0072B2", "#E69F00")

# Re-initialize the parameters at (-1, -1)
Beta <- 0*Beta-1

# https://www.r-graph-gallery.com/line-chart-several-groups-ggplot2.html
# Overlay the current model line, its predictions, and a vertical segment
# from each prediction to its observation (the per-point gradient yhat - y).
data_space_plot + geom_abline(aes(color='Model',
                                  intercept = Beta[1,], 
                                  slope = Beta[2,])) +
    geom_point(data=tibble(x=x, yhat=as.matrix(X)%*%as.matrix(Beta)),
               mapping=aes(x=x, y=yhat, color='Predictions')) +
    geom_line(data=tibble(x=rep(x,2),
                          group=x,
                          yhat=as.matrix(X)%*%as.matrix(Beta) %>% 
                               as_tibble() %>%   # as.tibble() is deprecated
                               bind_rows(tibble(estimate=y))), 
              mapping=aes(x=x, group=group, y=yhat$estimate, 
                          color='Gradients')) +
  ggtitle('Re-Imagining "Data Space" as "Prediction Space"...
           ...so it\'s now a "Model" or "Parameter" space visualization')

Revised Gradient Descent Specification

  1. The prediction IS the model…
  2. The gradient of the cost function with respect to the model, then, is:

\[\LARGE \begin{align*} \sum_{i=1}^n (y_i - \hat y_i)^2 = {} & (\mathbf{y}-\mathbf{\hat y})^T(\mathbf{y}-\mathbf{\hat y})\\ = {} & \mathbf{y}^T\mathbf{y}-2\mathbf{y}^T \mathbf{\hat y}+\mathbf{\hat y}^T\mathbf{\hat y}\\ \nabla_{\mathbf{\hat y}} \frac{1}{2} (\mathbf{y}-\mathbf{\hat y})^T(\mathbf{y}-\mathbf{\hat y}) = {} & \nabla_{\mathbf{\hat y}} \frac{1}{2}(\mathbf{\hat y}^T\mathbf{\hat y} -2\mathbf{\hat y}^T \mathbf{ y}+ \mathbf{ y}^T\mathbf{ y}) \\ = {} & \frac{1}{2}(2 \mathbf{\hat y} -2\mathbf{ y}) = \mathbf{\hat y} - \mathbf{ y} \end{align*}\]

  1. To decrease the cost, go in the negative direction of the gradient, which points “up” relative to the cost function
  2. So if \(\mathbf{\hat y}\) is larger than \(\mathbf{y}\), we should move in the negative direction
  • PollEv.com/scottschwart658
  • What is the parameter in \((\mathbf{y}-\mathbf{\mathbf{\hat y}})^T(\mathbf{y}-\mathbf{\mathbf{\hat y}})\)?
  • And it’s dimension?
  • What is \(\nabla_{\mathbf{\hat y}} \frac{1}{2} (\mathbf{y}-\mathbf{\hat y})^T(\mathbf{y}-\mathbf{\hat y}) = \mathbf{\hat y} - \mathbf{ y}\)?

Gradient Descent in “Prediction” Space

initialize the gradient descent algorithm

# A larger learning rate is usable here: the prediction-space gradient
# (yhat - y) is naturally scaled to the data.
alpha_learning_rate <- 0.5
step <- 1
# Record the step-1 parameters as column "estimate1" (paste0 has no `sep`
# argument, so the original's stray sep='' is dropped).
Betas <- Beta %>% rename_at(step,
                            function(x) paste0(x, step))
loss_pars <- c(sum((as.matrix(X)%*%as.matrix(Beta)-as.matrix(y))^2))

take some gradient descent steps

  • this is repeatedly manually run (10 times)
# One prediction-space gradient descent step (manually re-run, 10 times).
current_yhat <- as.matrix(X) %*% as.matrix(Beta)
# Move predictions toward y along the negative gradient (yhat - y)
new_yhat <- current_yhat - alpha_learning_rate*(current_yhat-y)
# Project the updated predictions back onto the linear model class by
# refitting least squares to them
Beta <- lm(new_yhat~x) %>% broom::tidy() %>% select(estimate)
new_yhat <- as.matrix(X) %*% as.matrix(Beta)

step <- step + 1
# Store this step's parameters as a new column "estimate<step>"
Betas <- Betas %>%
  add_column(Beta) %>%
  rename_at(step, function(x) paste0(x, step))

# Record the SSE loss after this step
loss_pars <- append(loss_pars,
                    sum((as.matrix(X)%*%as.matrix(Beta)-as.matrix(y))^2))

# Append the per-step losses as a display-only third row
tmp <- Betas %>% add_row()
tmp[3,]<-loss_pars
tmp
##   estimate1    estimate2  estimate3 estimate4 estimate5 estimate6 estimate7
## 1   -1.0000 -0.237917494  0.1431238 0.3336444 0.4289047 0.4765349 0.5003499
## 2   -1.0000 -0.001223438  0.4981648 0.7478590 0.8727061 0.9351296 0.9663414
## 3  256.9209 66.112546266 18.4104623 6.4849414 3.5035611 2.7582160 2.5718798
##   estimate8 estimate9 estimate10
## 1 0.5122575 0.5182112  0.5211881
## 2 0.9819472 0.9897502  0.9936517
## 3 2.5252957 2.5136497  2.5107382

Gradient Descent in “Prediction” Space

# For each recorded step, show the model line, its predictions, and the
# per-point gradient segments from prediction to observation.
set_names(1:10) %>% map(~ data_space_plot + 
                          geom_abline(aes(color='Model',
                                          intercept=Betas[1,.x], 
                                          slope=Betas[2,.x])) +
                          geom_point(data=tibble(
                                          x=x, 
                                          yhat=as.matrix(X)%*%as.matrix(Betas[,.x])), 
                                     mapping=aes(x=x, y=yhat, 
                                                 color='Predictions')) +
                          geom_line(data=tibble(
                                         x=rep(x,2),
                                         group=x,
                                         yhat=as.matrix(X)%*%as.matrix(Betas[,.x]) %>% 
                                         as_tibble() %>%   # as.tibble() is deprecated
                                         bind_rows(tibble(V1=y))), 
                                    mapping=aes(x=x, group=group, 
                                                y=yhat$V1, 
                                                color='Gradients')) +
                          ggtitle('Path of Gradient Descent through "Prediction Space"'))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## $`6`

## 
## $`7`

## 
## $`8`

## 
## $`9`

## 
## $`10`

Gradient Descent in “Parameter” Space

# Add the prediction-space descent path to the surface that already shows
# the parameter-space path, so the two trajectories can be compared.
parameter_space_plot %>%
  add_trace(x = unlist(c(Betas[1,])), 
            y = unlist(c(Betas[2,])),
            z = loss_pars, 
            type = "scatter3d",
            mode = "lines+markers",
            line = list(color = "red", width = 4))

Gradient Boosting with Gradient Boosted Trees

\[ \LARGE \begin{align*} \mathbf{\hat y}_t = {}& \mathbf{\hat y}_{t-1}+\alpha (\mathbf{ y-\hat y}_{t-1})\\ \mathbf{\hat y}_t = {}& \mathbf{\hat y}_{t-1}+\alpha {\boldsymbol \epsilon}_{t-1}\\ \mathbf{\hat y}_t = {}& \mathbf{\hat y}_{t-1}+\alpha f({\boldsymbol \epsilon}_{t-1})\\ \mathbf{\hat y}_t = {}& \mathbf{\hat y}_{t-1}+\alpha \boldsymbol{\hat \epsilon}_{t-1} \end{align*} \]

  • \(\boldsymbol{\hat \epsilon}_{t-1}\) is a model
  • \(\mathbf{\hat y}_{t-1}\) is a model
  • \(\mathbf{\hat y}_{t} = \mathbf{\hat y}_{t-1}+\alpha \boldsymbol{\hat \epsilon}_{t-1}\) is therefore another model
  • and \(\mathbf{\hat y}_{t}\) is closer to \(\mathbf{y}\) than \(\mathbf{\hat y}_{t-1}\) was…

# Initial boosted model: predicts zero everywhere, preserving the shape
# (and names/attributes) of its input.
f0 <- function(x) {
  0 * x
}
# Dense grid over the observed x range, used to draw the fitted curve
support <- seq(min(x), max(x), .01)

step=1
# fhat_model[[t]]: cumulative boosted model evaluated on `support`
fhat_model <- list(f0(support))
# fhat_datas: most recent additive increment evaluated at the data points
fhat_datas <- f0(x)
# Plot the (flat) initial model over the data
data_space_plot + geom_line(data=tibble(x=support, y=fhat_model[[step]], 
                                        color='GBT'),
                            mapping=aes(x=x,y=y,color=color))

# y_leftover tracks the residual: y minus the cumulative fit so far
y_leftover <- y
# https://www.statmethods.net/advstats/cart.html
# https://www.rdocumentation.org/packages/rpart/versions/4.1-15/topics/predict.rpart
alpha_learning_rate=0.1

# Subtract the most recent increment (zero on the first pass, since
# fhat_datas starts as f0(x)); this chunk is manually re-run each step
y_leftover <- y_leftover-fhat_datas

# Fit a small regression tree ("stump", depth <= 2) to the current residuals
stump <- rpart(y~x, data=tibble(x=x, y=y_leftover), 
               control=rpart.control(maxdepth=2, minbucket=1, minsplit=2, cp=0))
# New additive increment: learning rate times the stump's fitted values
fhat_datas <- alpha_learning_rate*predict(stump) 
# Cumulative model on the plotting grid: previous model + scaled stump
fhat_model <- append(fhat_model, 
                     list(fhat_model[[step]] + alpha_learning_rate *
                          as.numeric(predict(stump, newdata=tibble(x=support)))))
step = step+1

# Plot the updated cumulative model over the data
data_space_plot + geom_line(data=tibble(x=support, y=fhat_model[[step]], 
                                        color='GBT'),
                            mapping=aes(x=x,y=y,color=color))

# (same plot repeated in the rendered document after another manual re-run)
data_space_plot + geom_line(data=tibble(x=support, y=fhat_model[[step]], 
                                        color='GBT'),
                            mapping=aes(x=x,y=y,color=color))

# https://www.datanovia.com/en/blog/gganimate-how-to-create-plots-with-beautiful-animation-in-r/

# https://tidyr.tidyverse.org/reference/pivot_longer.html
# Build a long-format table for gganimate: one column per boosting step
# (auto-repaired names .x...1, .x...2, ...), plus x, pivoted so each
# step's curve becomes a group of rows.
animation <- fhat_model %>% 
  map(~ tibble(.x)) %>% 
  bind_cols() %>% 
  add_column(tibble(x=support)) %>% 
  pivot_longer(cols=starts_with(".x"))
## New names:
## * .x -> .x...1
## * .x -> .x...2
## * .x -> .x...3
## * .x -> .x...4
## * .x -> .x...5
## * ...
# https://www.rdocumentation.org/packages/stringr/versions/1.4.0/topics/str_replace

# Extract the step number from repaired names like ".x...7".  fixed() makes
# the pattern literal — as a regex, "." would match any character.
animation %>% mutate(name=as.integer(str_replace(name, fixed('.x...'), ''))) %>%
  ggplot(aes(x=x, y=value)) + geom_line() +
  transition_time(name) + labs(title = "Step: {frame_time}") +
  geom_point(data=tibble(x=x,y=y), mapping=aes(x,y))